For this walk-through we will be using the same example (and much of the same code!) as in this chapter - https://uclouvain-cbio.github.io/WSBIM2122/sec-rnaseq.html
For a basic intro to R and dplyr, please watch this series of videos made by Duke’s Center for Computational Thinking - https://warpwire.duke.edu/w/f0YGAA/
To learn more about the DESeq2 package, please refer to this wonderful guide - http://master.bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html#exploratory-analysis-and-visualization
# system.file() finds the location of the specified file in the specified package
# load() then opens the specified file (`.rda` objects in this case) so that they appear in our global environment
load(system.file("extdata/deseq2/counts.rda",
package = "rWSBIM2122"))
load(system.file("extdata/deseq2/coldata.rda",
package = "rWSBIM2122"))
# you should now see two new objects in your Environment pane (top right-hand corner)
# `coldata` is a small dataframe, so just type its name in the console and press enter to see its contents
coldata
# `counts` is a much larger object. How many rows and columns does it have?
dim(counts)
## [1] 58735 6
# Take a look at the first few rows of `counts`
head(counts)
# Looking at these two dataframes, what kind of analysis/comparison would you perform?
dds <- DESeqDataSetFromMatrix(countData = counts,
colData = coldata,
design = ~ Condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# How do we learn more about this new function, DESeqDataSetFromMatrix() from the DESeq2 package
?DESeq2::DESeqDataSetFromMatrix
# take a look at this new `dds` object
dds
## class: DESeqDataSet
## dim: 58735 6
## metadata(1): version
## assays(1): counts
## rownames(58735): ENSG00000223972 ENSG00000227232 ... ENSG00000277475
## ENSG00000268674
## rowData names(0):
## colnames(6): sample1 sample2 ... sample5 sample6
## colData names(3): Cell Type Condition
# `dds` is a new type formally known as an "S4 object"
# S4 objects have slots that can be accessed by using the `@` symbol
# to see our two original objects `coldata` and `counts`, we could run the following
dds@colData
## DataFrame with 6 rows and 3 columns
## Cell Type Condition
## <character> <character> <factor>
## sample1 Cell1 Epithelial mock
## sample2 Cell1 Epithelial mock
## sample3 Cell1 Epithelial mock
## sample4 Cell1 Epithelial KD
## sample5 Cell1 Epithelial KD
## sample6 Cell1 Epithelial KD
head(dds@assays@data[[1]])
## sample1 sample2 sample3 sample4 sample5 sample6
## ENSG00000223972 0 0 0 0 0 1
## ENSG00000227232 14 28 17 40 16 13
## ENSG00000278267 8 4 3 1 1 6
## ENSG00000243485 0 0 0 0 0 0
## ENSG00000284332 0 0 0 0 0 0
## ENSG00000237613 0 0 0 0 0 0
# There are functions that allow us to access elements of `dds` in a more intuitive way
# use the `counts()` function to access the original `counts` dataframe
head(counts(dds))
## sample1 sample2 sample3 sample4 sample5 sample6
## ENSG00000223972 0 0 0 0 0 1
## ENSG00000227232 14 28 17 40 16 13
## ENSG00000278267 8 4 3 1 1 6
## ENSG00000243485 0 0 0 0 0 0
## ENSG00000284332 0 0 0 0 0 0
## ENSG00000237613 0 0 0 0 0 0
# we can also use the `assay()` function for this
# use the `colData()` function to access the original (almost!) `coldata` dataframe
colData(dds)
## DataFrame with 6 rows and 3 columns
## Cell Type Condition
## <character> <character> <factor>
## sample1 Cell1 Epithelial mock
## sample2 Cell1 Epithelial mock
## sample3 Cell1 Epithelial mock
## sample4 Cell1 Epithelial KD
## sample5 Cell1 Epithelial KD
## sample6 Cell1 Epithelial KD
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# this function does a lot!
# let's ignore all of this for now and take a look at the results :)
head(results(dds))
## log2 fold change (MLE): Condition mock vs KD
## Wald test p-value: Condition mock vs KD
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000223972 0.164582 -0.964328 4.080473 -0.236327 0.813179
## ENSG00000227232 21.005516 -0.295976 0.539816 -0.548290 0.583493
## ENSG00000278267 4.127792 1.035234 1.142151 0.906390 0.364729
## ENSG00000243485 0.000000 NA NA NA NA
## ENSG00000284332 0.000000 NA NA NA NA
## ENSG00000237613 0.000000 NA NA NA NA
## padj
## <numeric>
## ENSG00000223972 NA
## ENSG00000227232 0.740534
## ENSG00000278267 0.555793
## ENSG00000243485 NA
## ENSG00000284332 NA
## ENSG00000237613 NA
# Notice that the log2FoldChange has the "KD" group as the base comparison
# why is this?
dds$Condition
## [1] mock mock mock KD KD KD
## Levels: KD mock
class(dds$Condition)
## [1] "factor"
# `Condition` is a factor, meaning that it is a categorical variable, which is a variable that contains a fixed number of categories or groups that a given observation can belong to
# The number of levels of a given factor refer to the number of categories this variable contains
# The order of these levels determines how comparisons are made in statistical modeling
# In the case of `Condition`, "KD" appears first and so is the base comparison group
# To switch this order, we can run this -
coldata$Condition <- factor(coldata$Condition, levels = c("mock", "KD"))
# check levels
coldata$Condition
## [1] mock mock mock KD KD KD
## Levels: mock KD
# levels have been switched!
# create dds object again
dds <- DESeqDataSetFromMatrix(countData = counts,
colData = coldata,
design = ~ Condition)
## converting counts to integer mode
# run DESeq again
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)
head(res)
## log2 fold change (MLE): Condition KD vs mock
## Wald test p-value: Condition KD vs mock
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000223972 0.164582 0.964295 4.080473 0.236319 0.813185
## ENSG00000227232 21.005516 0.295977 0.539816 0.548293 0.583491
## ENSG00000278267 4.127792 -1.035232 1.142151 -0.906388 0.364730
## ENSG00000243485 0.000000 NA NA NA NA
## ENSG00000284332 0.000000 NA NA NA NA
## ENSG00000237613 0.000000 NA NA NA NA
## padj
## <numeric>
## ENSG00000223972 NA
## ENSG00000227232 0.740531
## ENSG00000278267 0.555790
## ENSG00000243485 NA
## ENSG00000284332 NA
## ENSG00000237613 NA
# visualize results using plotMA()
plotMA(res)
res_shrunk <- lfcShrink(dds,
coef = "Condition_KD_vs_mock",
type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
# Here's a nice article on empirical Bayes estimation - http://varianceexplained.org/r/empirical_bayes_baseball/
plotMA(res_shrunk)
# compare this plot with the previous one. How are they different?
# convert to dataframe
res_df <- as_tibble(res_shrunk, rownames = "ENSEMBL") # if we forget `rownames = `, we lose all the gene names!
# How would you remove all the rows that have `NAs` for the `padj` column
# We have a couple of options
# clue -
head(is.na(res_df$padj)) # gives us a bunch of TRUEs and FALSEs. We can use this to filter out the unwanted rows
## [1] TRUE FALSE FALSE TRUE TRUE TRUE
res_df_nona <- res_df %>%
filter(!is.na(padj))
# or we could use the `drop_na()` function
res_df_nona <- res_df %>%
drop_na(padj)
# how to verify that there are no missing values for `padj`?
sum(is.na(res_df_nona$padj)) # this should be 0!
## [1] 0
res_df <- res_df_nona
To learn how to use ggplot2, refer to this video - https://www.youtube.com/watch?v=WUwSVKasU9g
res_df %>%
ggplot(aes(x = log2FoldChange, y = -log10(padj),
color = padj < 0.05 & abs(log2FoldChange) > 1)) +
geom_point(size = 0.5) +
geom_hline(yintercept = -log10(0.05)) +
geom_vline(xintercept = 1) +
geom_vline(xintercept = -1) +
theme(legend.position = "none")
For a more detailed explanation of enrichment analyses (including ORA and gene set enrichment analysis), please watch this video - https://youtu.be/ZgZKmAYm-LE
# Get ENTREZ IDs
# We'll need this for using the "org.Hs.eg.db" package
# Step One: Connect to the selected BioMart database and dataset hosted by Ensembl
ensembl <- useEnsembl(biomart = "genes",
dataset = "hsapiens_gene_ensembl")
# Step Two: Retrieve gene names
# build a biomaRt query
# The getBM() function is the main query function in biomaRt
ensembl_to_entrez <- getBM(attributes = c("ensembl_gene_id", "external_gene_name",
"entrezgene_id"),
values = res_df$ENSEMBL,
mart = ensembl)
# Plan B in case there's a connection problem -
# ensembl_to_entrez <- read_csv("https://raw.githubusercontent.com/dukecct/CBRG/main/inst/data/ensembl_to_entrez.csv")
# add this new info to res_df_nona
res_df <- res_df %>%
left_join(ensembl_to_entrez, by = c("ENSEMBL" = "ensembl_gene_id"))
res_df# remove rows with NAs in the columns in `entrezgene_id` and `padj`
res_df <- res_df %>%
drop_na(entrezgene_id, padj)
# are all ENTREZ IDs unique?
length(unique(res_df$entrezgene_id)) # 13711
## [1] 13711
nrow(res_df) # 13730
## [1] 13730
# drop duplicates
res_df <- res_df %>%
arrange(padj) %>%
distinct(entrezgene_id, .keep_all = TRUE)
# are all ENTREZ IDs unique?
length(unique(res_df$entrezgene_id)) == nrow(res_df) # TRUE
## [1] TRUE
# perform ORA
# we need a vector of ENTREZ IDs for genes with padj values < 0.05
sig_genes <- res_df %>%
filter(padj < 0.05, log2FoldChange > 1) %>%
pull(entrezgene_id)
head(sig_genes)
## [1] 79413 55270 1474 7533 54974 2013
go_ora <- enrichGO(gene = as.character(sig_genes),
OrgDb = org.Hs.eg.db,
universe = as.character(res_df$entrezgene_id),
ont = "MF",
readable = TRUE) # maps gene IDs to gene names
head(go_ora)
# Visualization
# dot plot
go_ora %>%
dotplot(showCategory = 30) + # showCategory = number of enriched terms to display
ggtitle("dotplot for ORA")
## wrong orderBy parameter; set to default `orderBy = "x"`
# or make your own plot!
ora_plot_interactive <- go_ora@result %>%
dplyr::slice(1L:20L) %>% # show the first 20 enriched terms
dplyr::mutate(GeneRatio = sapply(GeneRatio, function(x) eval(parse(text = x)))) %>% # compute decimal ratios) %>%
# NOTE: Here we have switch from the pipe (%>%) to the "+" sign because ggplot2 only uses "+" (so far...)
ggplot(aes(x = reorder(ID, GeneRatio),
y = GeneRatio,
fill = p.adjust,
label = Description)) +
geom_col() +
coord_flip() +
labs(x = "Enriched Term",
y = "Gene Ratio") +
theme_minimal()
plotly::ggplotly(ora_plot_interactive) # plotly CRAN package allows us to create interactive ggplot2 plots very easy!